##############################################################
# ESTIMATING CONSTITUENCY PREFERENCES FROM SPARSE SURVEY DATA
# USING AUXILLIARY GEOGRAPHIC INFORMATION
#
#		         -- R and WinBUGS code--
#      -- Extension of Park, Gelman, Bafumi (2004) --
#
# July 2011
# PETER SELB, SIMON MUNZERT
# UNIVERSITY OF KONSTANZ, GERMANY
##############################################################


##############################################################
### Preliminary remark
# Code for data preparation was taken from http://www.stat.columbia.edu/~gelman/arm/examples/election88/


# Set working directory (supplementary data has to be stored in this folder)
setwd("C:/Uni/Research/ConstituencyPrefs/Supplementary Data")

# Load libraries
library(maptools)
library(sp)
library(spdep)
library(RColorBrewer)
library(R2WinBUGS)
library(foreign)
library(arm)
library(gpclib)


##############################################################
# Data preparation
# -----------------------------------------------------------

data(state)                  # "state" is an R data file
state.abbr <- c (state.abb[1:8], "DC", state.abb[9:50])
dc <- 9
not.dc <- c(1:8,10:51)
region <- c(3,4,4,3,4,4,1,1,5,3,3,4,4,2,2,2,2,3,3,1,1,1,2,2,3,2,4,2,4,1,1,4,1,3,2,2,3,4,1,1,3,2,3,3,4,1,3,4,1,2,4)

# Load in data from the CBS polls in 1988
# Data are at http://www.stat.columbia.edu/~gelman/arm/examples/election88
polls <- read.dta ("polls.dta")
attach.all (polls)

# Select just the data from the last survey (#9158)
ok <- survey==9158            # define the condition
polls.subset <- polls[ok,]    # select the subset of interest
attach.all (polls.subset)     # attach the subset
write.table (polls.subset, "polls.subset.dat")

# define other data summaries
y <- bush                  # 1 if support bush, 0 if support dukakis
n <- length(y)             # of survey respondents
n.age <- max(age)          # of age categories
n.edu <- max(edu)          # of education categories
n.state <- max(state)      # of states
n.region <- max(region)    # of regions

# compute unweighted and weighted averages for the U.S.
ok <- !is.na(y) # remove the undecideds
cat ("national mean of raw data:", round (mean(y[ok]==1), 3), "\n")
cat ("national weighted mean of raw data:",
     round (sum((weight*y)[ok])/sum(weight[ok]), 3), "\n")

# compute weighted averages for the states
raw.weighted <- rep (NA, n.state)
names (raw.weighted) <- state.abbr
for (i in 1:n.state){
  ok <- !is.na(y) & state==i
  raw.weighted[i] <- sum ((weight*y)[ok])/sum(weight[ok])
}

# load in 1988 election data as a validation check
election88 <- read.dta("election88.dta")
outcome <- election88$electionresult

# load in 1988 census data
census <- read.dta("census88.dta")

# also include a measure of previous vote as a state-level predictor
presvote <- read.dta("presvote.dta")
attach (presvote)
v.prev <- presvote$g76_84pr
not.dc <- c(1:8,10:51)
candidate.effects <- read.table ("candidate_effects.dat", header=T)
v.prev[not.dc] <- v.prev[not.dc] +
 (candidate.effects$X76 + candidate.effects$X80 + candidate.effects$X84)/3
# Data are at http://www.stat.columbia.edu/~gelman/arm/examples/election88


##############################################################
#  Create spatial data frame

# Read in US states shape file
# Shape data from http://www.colorado.edu/geography/foote/maps/assign/hotspots/hotspots.html

usa <- readShapePoly("STATES.shp")
summary(usa)
plot(usa)

# Union districts entailing multiple polygons
gpclibPermit()
sp.districts <- unionSpatialPolygons(usa, usa$STATE_ABBR)
plot(sp.districts, col="grey")

# Create spatial polygon data frame
df.area <- merge(election88, usa, by.x=c("st"), by.y=c("STATE_ABBR"),all=T)
rownames(df.area) <- df.area$st
spdf.districts2 <- SpatialPolygonsDataFrame(sp.districts,df.area)
spdf.districts <- spdf.districts2[order(spdf.districts2$stnum),]
spdf.districts$region <- c(3,4,4,3,4,4,1,1,5,3,3,4,4,2,2,2,2,3,3,1,1,1,2,2,3,2,4,2,4,1,1,4,1,3,2,2,3,4,1,1,3,2,3,3,4,1,3,4,1,2,4)
plot(spdf.districts)
invisible(text(getSpPPolygonsLabptSlots(spdf.districts), labels=as.character(spdf.districts$region), cex=0.5))

# Create list of neighbors within region bounds
region <- c(3,4,4,3,4,4,1,1,5,3,3,4,4,2,2,2,2,3,3,1,1,1,2,2,3,2,4,2,4,1,1,4,1,3,2,2,3,4,1,1,3,2,3,3,4,1,3,4,1,2,4)
spdf.districts$region <- region
nb.districts <- poly2nb(spdf.districts)
nb <- unlist(nb.districts)
adj <- nb[nb!=0]
num <- card(nb.districts)

# Create technical vector including neighbor times region number of districts
region.stretch <- NULL
for (i in 1:length(region)) {
reg.vec <-(rep(region[i],num[i]))
region.stretch <- c(region.stretch, reg.vec)
}

# Create technical vector including region number of neighbors
nb.region <- NA
for (i in 1:length(adj)) { nb.region[i] <- region[adj[i]] }

nb.region.same <- NULL
for (i in 1:length(region.stretch)) {
nb.region.same[i] <- ifelse(nb.region[i] == region.stretch[i], 1, 0)
}

adj.within.region <- adj[nb.region.same==1]
weight.within.region <- rep(1, times=length(adj.within.region))
num <- card(nb.districts)

num.within.region <- NULL
numn <- 0
for (i in 1:length(num)) {
num.within.region[i] <- sum(nb.region.same[(numn+1):(num[i]+numn)])
numn <- num[i]+numn
}
num.within.region <- ifelse(num==0,0,num.within.region)
num.within.region
sum(num.within.region)

# Spatial data for WinBUGS
adj <- adj.within.region
adj
weight <- weight.within.region
weight
num <- num.within.region
num
spdata <- list("adj","weight", "num")



##############################################################
# Replicate Park et al.'s (2004) analysis adding spatial REs

# Pre-define spatially structured REs for states with no neighbors
v = ifelse(num.within.region==0, 0, NA)

# Set up the predictors
age.edu <- n.edu*(age-1) + edu
v.prev.full <- v.prev[state]

# Idiosyncratic state effect
dir.mu <- rep(NA,n.state)
state.list <- list(state)
dir.mu <- aggregate(y, by=state.list, mean, na.rm=T)
state.ids <- as.data.frame(seq(1,51,by=1))
state.ids$state.ids <-  seq(1,51,by=1)
rownames(state.ids) <-  seq(1,51,by=1)
dir.mu.new <- merge(dir.mu, state.ids, by.x="Group.1", by.y="state.ids", all=T)
dir.mu.state <- as.vector(dir.mu.new$x)
u.raw = ifelse(!is.na(dir.mu.state), NA, 0)
# No. of non-missing states
ns <- length(as.vector(dir.mu$Group.1))
# IDs of non-missing states
s <- as.vector(dir.mu$Group.1)

# BUGS Model for election88, including spatial RE
model.pgb.spatial {
  for (i in 1:n){
    y[i] ~ dbin (p.bound[i], 1)
    p.bound[i] <- max(0, min(1, p[i]))
    logit(p[i]) <- Xbeta[i]
    Xbeta[i] <- b.female*female[i] + b.black*black[i] +
      b.female.black*female[i]*black[i] +
      b.age[age[i]] + b.edu[edu[i]] + b.age.edu[age[i],edu[i]] +
      b.state[state[i]] + v[state[i]]
  }

  b.female ~ dnorm (0, .0001)
  b.black ~ dnorm (0, .0001)
  b.female.black ~ dnorm (0, .0001)

  for (j in 1:n.age){
    b.age[j] <- xi.age*(b.age.raw[j] - mean(b.age.raw[]))
    b.age.raw[j] ~ dnorm (0, tau.age.raw)
  }
  for (j in 1:n.edu){
    b.edu[j] <- xi.edu*(b.edu.raw[j] - mean(b.edu.raw[]))
    b.edu.raw[j] ~ dnorm (0, tau.edu.raw)
  }
  for (j in 1:n.age){
    for (k in 1:n.edu){
    b.age.edu[j,k] <- xi.age.edu*(b.age.edu.raw[j,k] - mean(b.age.edu.raw[,]))
    b.age.edu.raw[j,k] ~ dnorm(0, tau.age.edu.raw)
    }
  }

  for (j in 1:n.state){
    b.state[j] <- xi.state*(b.state.raw[j] - mean(b.state.raw[]))
    b.state.raw[j] <- b.state.0.raw + b.v.prev.raw*v.prev[j] + u.raw[j]
  }
  
  for(j in 1:ns){
  u[s[j]] <- xi.state*u.raw[s[j]]
  u.raw[s[j]] ~ dnorm (0, tau.u.raw)
  }  
  
  b.v.prev <- xi.state*b.v.prev.raw
  b.v.prev.raw ~ dnorm(0, .0001)  
  b.state.0  <- xi.state*b.state.0.raw
  b.state.0.raw  ~ dnorm(0, .0001)
    
  v[1:n.state] ~ car.normal(adj[], weight[], num[], tauv)
    
  tau.age.raw <- pow(sigma.age.raw, -2)
  tau.edu.raw <- pow(sigma.edu.raw, -2)
  tau.age.edu.raw <- pow(sigma.age.edu.raw, -2)
  tau.u.raw <- pow(sigma.u.raw, -2)  
  tauv <- pow(sigmav, -2)

  xi.age ~ dunif (0, 100)
  xi.edu ~ dunif (0, 100)
  xi.age.edu ~ dunif (0, 100)
  xi.state ~ dunif (0, 100)
  
  sigma.age.raw ~ dunif (0, 100)
  sigma.edu.raw ~ dunif (0, 100)
  sigma.age.edu.raw ~ dunif (0, 100)
  sigma.u.raw ~ dunif (0, 100)
  sigmav ~ dunif(0, 100) 
  
  sigma.age <- xi.age*sigma.age.raw
  sigma.edu <- xi.edu*sigma.edu.raw
  sigma.age.edu <- xi.age.edu*sigma.age.edu.raw
  sigma.u <- xi.state*sigma.u.raw

}

write.model(model.pgb.spatial, "modelpgbspatial.bug")

# Multilevel logistic regression
data <- list ("n", "n.age", "n.edu", "n.state", "y", "female", "black", "age", "edu", "state",  "v.prev", "v", "u.raw","ns","s")
 
inits <- function () {list(b.state.0.raw=rnorm(1), b.female=rnorm(1), b.black=rnorm(1), b.female.black=rnorm(1), b.age.raw=rnorm(n.age), b.edu.raw=rnorm(n.edu), u.raw=ifelse(is.na(dir.mu.state), NA, rnorm(n.state,0,1)),  b.v.prev.raw=rnorm(1), v=ifelse(num==0, NA, rnorm(n.state,0,1)), sigma.age.raw=runif(1), sigma.edu.raw=runif(1), sigma.age.edu.raw=runif(1), sigma.u.raw=runif(1), xi.state=runif(1), xi.age=runif(1),xi.edu=runif(1),xi.age.edu=runif(1),sigmav=runif(1,0,1))
}

params <- c ("b.female", "b.black", "b.female.black", "b.age", "b.edu", "b.age.edu", "b.state.0", "b.state", "b.v.prev","v", "sigma.age", "sigma.edu", "sigma.age.edu", "sigma.u", "sigmav", "u")
 
M2spatial.bugs <- bugs(data=c(data,spdata), inits, params, "modelpgbspatial.bug", n.chains=3, n.iter=30000, n.burnin=20000,  n.thin=10, DIC=F, bugs.directory="C:/Program Files (x86)/WinBUGS14", debug=TRUE)

plot(M2spatial.bugs)





##############################################################
### Literature

# Park, David K., Andrew Gelman and Joseph Bafumi. 2004. "Bayesian Multilevel Estimation with Poststratification: State-Level Estimates from National Polls." Political Analysis 12:375�385.